{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Configuring pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# import numpy and pandas\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "# used for dates\n",
    "import datetime\n",
    "from datetime import datetime, date\n",
    "\n",
    "# Set some pandas options controlling output format\n",
    "pd.set_option('display.notebook_repr_html', False)\n",
    "pd.set_option('display.max_columns', 8)\n",
    "pd.set_option('display.max_rows', 10)\n",
    "pd.set_option('display.width', 65)\n",
    "\n",
    "# bring in matplotlib for graphics\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "# Creating Categoricals"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[low, high, medium, medium, high]\n",
       "Categories (3, object): [high, low, medium]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# create a categorical directly from a list.  \n",
    "lmh_values = [\"low\", \"high\", \"medium\", \"medium\", \"high\"]\n",
    "lmh_cat = pd.Categorical(lmh_values)\n",
    "lmh_cat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['high', 'low', 'medium'], dtype='object')"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# examine the categories\n",
    "lmh_cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['low', 'high', 'medium', 'medium', 'high'], dtype=object)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# retreive the values\n",
    "lmh_cat.get_values()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1, 0, 2, 2, 0], dtype=int8)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# .codes shows the integer mapping for each value of the categorical\n",
    "lmh_cat.codes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[low, high, medium, medium, high]\n",
       "Categories (3, object): [low, medium, high]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# create from list but explicitly state the categories\n",
    "lmh_cat = pd.Categorical(lmh_values,\n",
    "                         categories=[\"low\", \"medium\", \"high\"])\n",
    "lmh_cat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0, 2, 1, 1, 2], dtype=int8)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# the codes are...\n",
    "lmh_cat.codes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[low, medium, medium, high, high]\n",
       "Categories (3, object): [low, medium, high]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# sorting is done using the codes underlying each value\n",
    "lmh_cat.sort_values()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0       low\n",
       "1      high\n",
       "2    medium\n",
       "3    medium\n",
       "4      high\n",
       "dtype: category\n",
       "Categories (3, object): [high, low, medium]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# create a categorical using a Series and dtype\n",
    "cat_series = pd.Series(lmh_values, dtype=\"category\")\n",
    "cat_series"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0       low\n",
       "1      high\n",
       "2    medium\n",
       "3    medium\n",
       "4      high\n",
       "dtype: category\n",
       "Categories (3, object): [high, low, medium]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# create a categorical using .astype()\n",
    "s = pd.Series(lmh_values)\n",
    "as_cat = s.astype('category')\n",
    "cat_series"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<pandas.core.categorical.CategoricalAccessor object at 0x1063dc7b8>"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# a categorical has a .cat property that lets you access info\n",
    "cat_series.cat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['high', 'low', 'medium'], dtype='object')"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# get the index for the categorical\n",
    "cat_series.cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   Values\n",
       "0      65\n",
       "1      49\n",
       "2      56\n",
       "3      43\n",
       "4      43"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# create a DataFrame of 100 values\n",
    "np.random.seed(123456)\n",
    "values = np.random.randint(0, 100, 5)\n",
    "bins = pd.DataFrame({ \"Values\": values})\n",
    "bins"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   Values     Group\n",
       "0      65  (60, 70]\n",
       "1      49  (40, 50]\n",
       "2      56  (50, 60]\n",
       "3      43  (40, 50]\n",
       "4      43  (40, 50]"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# cut the values into \n",
    "bins['Group'] = pd.cut(values, range(0, 101, 10))\n",
    "bins"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    (60, 70]\n",
       "1    (40, 50]\n",
       "2    (50, 60]\n",
       "3    (40, 50]\n",
       "4    (40, 50]\n",
       "Name: Group, dtype: category\n",
       "Categories (10, interval[int64]): [(0, 10] < (10, 20] < (20, 30] < (30, 40] ... (60, 70] < (70, 80] < (80, 90] < (90, 100]]"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# examine the categorical that was created\n",
    "bins.Group"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[bronze, gold, silver, bronze]\n",
       "Categories (3, object): [bronze < silver < gold]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# create an ordered categorical of precious metals\n",
    "# order is important for determining relative value\n",
    "metal_values = [\"bronze\", \"gold\", \"silver\", \"bronze\"]\n",
    "metal_categories = [\"bronze\", \"silver\", \"gold\"]\n",
    "metals = pd.Categorical(metal_values,\n",
    "                        categories=metal_categories,\n",
    "                        ordered = True)\n",
    "metals"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[bronze, silver, gold, bronze]\n",
       "Categories (3, object): [bronze < silver < gold]"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# reverse the metals\n",
    "metals_reversed_values = pd.Categorical(\n",
    "    metals.get_values()[::-1],\n",
    "    categories = metals.categories, \n",
    "    ordered=True)\n",
    "metals_reversed_values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ True, False,  True,  True], dtype=bool)"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# compare the two categoricals\n",
    "metals <= metals_reversed_values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0, 2, 1, 0], dtype=int8)"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# codes are the integer value assocaited with each item\n",
    "metals.codes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0, 1, 2, 0], dtype=int8)"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# and for metals2\n",
    "metals_reversed_values.codes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[bronze, NaN]\n",
       "Categories (3, object): [bronze, silver, gold]"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# creating a categorical with a non existent category\n",
    "pd.Categorical([\"bronze\", \"copper\"],\n",
    "               categories=metal_categories)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Renaming Categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[a, b, c, a]\n",
       "Categories (3, object): [a, b, c]"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# create a categorical with 3 categories\n",
    "cat = pd.Categorical([\"a\",\"b\",\"c\",\"a\"], \n",
    "                     categories=[\"a\", \"b\", \"c\"])\n",
    "cat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[bronze, silver, gold, bronze]\n",
       "Categories (3, object): [bronze, silver, gold]"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# renames the categories (and also the values)\n",
    "cat.categories = [\"bronze\", \"silver\", \"gold\"]\n",
    "cat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[x, y, z, x]\n",
       "Categories (3, object): [x, y, z]"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# this also renames \n",
    "cat.rename_categories([\"x\", \"y\", \"z\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[bronze, silver, gold, bronze]\n",
       "Categories (3, object): [bronze, silver, gold]"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# the rename is not done in-place\n",
    "cat"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Appending new categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[bronze, gold, silver, bronze]\n",
       "Categories (4, object): [bronze < silver < gold < platinum]"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# add a new platimnum category\n",
    "with_platinum = metals.add_categories([\"platinum\"])\n",
    "with_platinum"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Removing Categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[NaN, gold, silver, NaN]\n",
       "Categories (2, object): [silver < gold]"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# remove bronze category\n",
    "no_bronze = metals.remove_categories([\"bronze\"])\n",
    "no_bronze"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Removing unused categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[bronze, gold, silver, bronze]\n",
       "Categories (3, object): [bronze < silver < gold]"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# remove any unused categories (in this case, platinum)\n",
    "with_platinum.remove_unused_categories()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Setting categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0     one\n",
       "1     two\n",
       "2    four\n",
       "3    five\n",
       "dtype: category\n",
       "Categories (4, object): [five, four, one, two]"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# sample Series\n",
    "s = pd.Series([\"one\",\"two\",\"four\", \"five\"], dtype=\"category\")\n",
    "s"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0     one\n",
       "1     NaN\n",
       "2    four\n",
       "3     NaN\n",
       "dtype: category\n",
       "Categories (2, object): [one, four]"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# remove the \"two\", \"three\" and \"five\" categories (replaced with NaN)\n",
    "s = s.cat.set_categories([\"one\",\"four\"])\n",
    "s"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Describe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "            counts  freqs\n",
       "categories               \n",
       "bronze           2   0.50\n",
       "silver           1   0.25\n",
       "gold             1   0.25"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# get descriptive info on the metals categorical\n",
    "metals.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Value counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "bronze    2\n",
       "silver    1\n",
       "gold      1\n",
       "dtype: int64"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# count the values in the categorical\n",
    "metals.value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Minimum, maximum and mode"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('bronze', 'gold', [bronze]\n",
       " Categories (3, object): [bronze < silver < gold])"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# find the min, max and mode of the metals categorical\n",
    "(metals.min(), metals.max(), metals.mode())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "# Munging school grades"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   Grade    Name\n",
       "0     51   Ivana\n",
       "1     92  Norris\n",
       "2    100    Ruth\n",
       "3     99    Lane\n",
       "4     93    Skye\n",
       "5     97     Sol\n",
       "6     93   Dylan\n",
       "7     77  Katina\n",
       "8     82  Alissa\n",
       "9     73    Marc"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 10 students with random grades\n",
    "np.random.seed(123456)\n",
    "names = ['Ivana', 'Norris', 'Ruth', 'Lane', 'Skye', 'Sol', \n",
    "         'Dylan', 'Katina', 'Alissa', \"Marc\"]\n",
    "grades = np.random.randint(50, 101, len(names))\n",
    "scores = pd.DataFrame({'Name': names, 'Grade': grades})\n",
    "scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# bins and their mappings to letter grades\n",
    "score_bins =    [ 0,  59,   62,  66,   69,   72,  76,   79,   82,  \n",
    "                 86,   89,   92,  99, 100]\n",
    "letter_grades = ['F', 'D-', 'D', 'D+', 'C-', 'C', 'C+', 'B-', 'B', \n",
    "                 'B+', 'A-', 'A', 'A+']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   Grade    Name Letter\n",
       "0     51   Ivana      F\n",
       "1     92  Norris     A-\n",
       "2    100    Ruth     A+\n",
       "3     99    Lane      A\n",
       "4     93    Skye      A\n",
       "5     97     Sol      A\n",
       "6     93   Dylan      A\n",
       "7     77  Katina     C+\n",
       "8     82  Alissa     B-\n",
       "9     73    Marc      C"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# cut based upon the bins and assign the letter grade\n",
    "letter_cats = pd.cut(scores.Grade, score_bins, labels=letter_grades)\n",
    "scores['Letter'] = letter_cats\n",
    "scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0     F\n",
       "1    A-\n",
       "2    A+\n",
       "3     A\n",
       "4     A\n",
       "5     A\n",
       "6     A\n",
       "7    C+\n",
       "8    B-\n",
       "9     C\n",
       "Name: Grade, dtype: category\n",
       "Categories (13, object): [F < D- < D < D+ ... B+ < A- < A < A+]"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# examine the underlying categorical\n",
    "letter_cats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "A     4\n",
       "A+    1\n",
       "A-    1\n",
       "B-    1\n",
       "C+    1\n",
       "     ..\n",
       "B     0\n",
       "C-    0\n",
       "D+    0\n",
       "D     0\n",
       "D-    0\n",
       "Name: Letter, Length: 13, dtype: int64"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# how many of each grade occurred?\n",
    "scores.Letter.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   Grade    Name Letter\n",
       "2    100    Ruth     A+\n",
       "6     93   Dylan      A\n",
       "5     97     Sol      A\n",
       "4     93    Skye      A\n",
       "3     99    Lane      A\n",
       "1     92  Norris     A-\n",
       "8     82  Alissa     B-\n",
       "7     77  Katina     C+\n",
       "9     73    Marc      C\n",
       "0     51   Ivana      F"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# and sort by letter grade instead of numeric grade\n",
    "scores.sort_values(by=['Letter'], ascending=False)"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
